import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import plotly.express as px
from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.cluster import KMeans
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
pd.set_option('display.max_columns', None)
import plotly.io as pio
pio.renderers.default = "notebook+plotly_mimetype+png+jpeg+svg+pdf"
# Display all rows
pd.set_option('display.max_rows', None)
# Using the data tables from the FBREF websiter, I will be getting the
# general statistics for the 5 major leagues in Europe
df_PL = pd.read_csv('PLStandard.csv', delimiter=',',skiprows=1)
df_LL = pd.read_csv('LLStandard.csv', delimiter=',',skiprows=1)
df_BL = pd.read_csv('BLStandard.csv', delimiter=',',skiprows=1)
df_SA = pd.read_csv('SAStandard.csv', delimiter=',',skiprows=1)
df_L1 = pd.read_csv('L1Standard.csv', delimiter=',',skiprows=1)
df_list = [df_PL, df_LL, df_BL, df_SA, df_L1]
# Cleaning the data by renaming the columns into more understandable names
# x before the stat means expected stat (ex: xG = expected goals) and G means goals, A means
# assists, PK means penalty kicks, npxG means non penalty kicks, PrgC means progressive carries,
# PrgP means progressive passes, 90 means per 90 minutes
column_names = ['Squad', '#OfPlayers', 'Age', 'Poss', 'MatchesPlayed', 'Starts', 'Min', '90s', 'Goals',
'Asists', 'G+A', 'G-PK', 'PK', 'PKAttempts', 'YCard', 'RCard', 'xG', 'NonPKxG',
'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'G/90', 'A/90', 'G+A/90', 'G-PK/90',
'G+A-PK/90', 'xG/90', 'xAG/90', 'xG+xAG/90', 'npxG/90', 'npxG+xAG/90']
df_PL.columns = column_names
for df in df_list:
df.columns = column_names
# Adding a column for the league name
df_PL["League"] = "Premier Leauge"
df_LL["League"] = "La Liga"
df_BL["League"] = "Bundesliga"
df_SA["League"] = "Serie A"
df_L1["League"] = "Ligue 1"
# Combining all the dataframes into one
df_Gen = pd.concat([df_PL, df_LL, df_BL, df_SA, df_L1], ignore_index=True)
df_Gen.head(5)
| Squad | #OfPlayers | Age | Poss | MatchesPlayed | Starts | Min | 90s | Goals | Asists | G+A | G-PK | PK | PKAttempts | YCard | RCard | xG | NonPKxG | xAG | npxG+xAG | PrgC | PrgP | G/90 | A/90 | G+A/90 | G-PK/90 | G+A-PK/90 | xG/90 | xAG/90 | xG+xAG/90 | npxG/90 | npxG+xAG/90 | League | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arsenal | 26 | 25.4 | 59.9 | 34 | 374 | 3060 | 34.0 | 78 | 59 | 137 | 75 | 3 | 4 | 47 | 0 | 66.3 | 63.5 | 48.8 | 112.3 | 744 | 1877 | 2.29 | 1.74 | 4.03 | 2.21 | 3.94 | 1.95 | 1.44 | 3.38 | 1.87 | 3.30 | Premier Leauge |
| 1 | Aston Villa | 26 | 27.8 | 49.6 | 34 | 374 | 3060 | 34.0 | 44 | 31 | 75 | 41 | 3 | 3 | 65 | 1 | 43.4 | 41.1 | 33.4 | 74.5 | 548 | 1120 | 1.29 | 0.91 | 2.21 | 1.21 | 2.12 | 1.28 | 0.98 | 2.26 | 1.21 | 2.19 | Premier Leauge |
| 2 | Bournemouth | 31 | 27.1 | 39.9 | 34 | 374 | 3060 | 34.0 | 36 | 23 | 59 | 36 | 0 | 0 | 61 | 0 | 35.6 | 35.6 | 25.8 | 61.3 | 452 | 887 | 1.06 | 0.68 | 1.74 | 1.06 | 1.74 | 1.05 | 0.76 | 1.80 | 1.05 | 1.80 | Premier Leauge |
| 3 | Brentford | 25 | 27.0 | 43.9 | 34 | 374 | 3060 | 34.0 | 50 | 30 | 80 | 43 | 7 | 8 | 49 | 1 | 50.6 | 44.4 | 33.6 | 78.0 | 332 | 1024 | 1.47 | 0.88 | 2.35 | 1.26 | 2.15 | 1.49 | 0.99 | 2.48 | 1.31 | 2.29 | Premier Leauge |
| 4 | Brighton | 27 | 27.4 | 60.2 | 32 | 352 | 2880 | 32.0 | 58 | 38 | 96 | 52 | 6 | 6 | 43 | 0 | 61.8 | 57.5 | 43.7 | 101.2 | 680 | 1579 | 1.81 | 1.19 | 3.00 | 1.62 | 2.81 | 1.93 | 1.37 | 3.30 | 1.80 | 3.16 | Premier Leauge |
# Offense Data
df_PL_Off = pd.read_csv('PLOff.csv', delimiter=',',skiprows=1)
df_LL_Off = pd.read_csv('LLOff.csv', delimiter=',',skiprows=1)
df_BL_Off = pd.read_csv('BLOff.csv', delimiter=',',skiprows=1)
df_SA_Off = pd.read_csv('SAOff.csv', delimiter=',',skiprows=1)
df_L1_Off = pd.read_csv('L1Off.csv', delimiter=',',skiprows=1)
df_list_Off = [df_PL_Off, df_LL_Off, df_BL_Off, df_SA_Off, df_L1_Off]
column_names_Off = ['Squad', '#OfPlayers', '90s', "Shot_Creating_Actions", "SCA/90", "PassLive",
"PassDead", "TakeOnsTS", "ShotTS", "FoulTS", "DefTS", "Goal_Creating_Actions",
"GCA/90", "PassLiveG", "PassDeadG", "TakeOnG", "ShotG", "FoulG", "DefG"]
for value in df_list_Off:
value.columns = column_names_Off
# Cleaning the data frame by droping unnessiary columns
value.drop(["PassLiveG", "PassDeadG", "TakeOnG", "ShotG", "FoulG", "DefG", '#OfPlayers', '90s'], axis=1, inplace=True)
# Creating a new dataframe with the offense data
df_Off = pd.concat(df_list_Off, ignore_index=True)
df_Off.head(5)
| Squad | Shot_Creating_Actions | SCA/90 | PassLive | PassDead | TakeOnsTS | ShotTS | FoulTS | DefTS | Goal_Creating_Actions | GCA/90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arsenal | 982 | 28.06 | 723 | 75 | 54 | 72 | 41 | 17 | 141 | 4.03 |
| 1 | Aston Villa | 701 | 20.03 | 534 | 46 | 43 | 37 | 32 | 9 | 74 | 2.11 |
| 2 | Bournemouth | 577 | 16.49 | 417 | 42 | 50 | 41 | 21 | 6 | 64 | 1.83 |
| 3 | Brentford | 627 | 17.91 | 424 | 89 | 28 | 37 | 44 | 5 | 86 | 2.46 |
| 4 | Brighton | 905 | 28.28 | 688 | 62 | 68 | 52 | 24 | 11 | 100 | 3.12 |
# Defense Data
df_PL_D = pd.read_csv('PLDef.csv', delimiter=',',skiprows=1)
df_LL_D = pd.read_csv('LLDef.csv', delimiter=',', skiprows=1)
df_BL_D = pd.read_csv('BLDef.csv', delimiter=',', skiprows=1)
df_SA_D = pd.read_csv('SADef.csv', delimiter=',', skiprows=1)
df_L1_D = pd.read_csv('L1Def.csv', delimiter=',',skiprows=1)
d_column_names = ['Squad', '#Pl', '90s', 'Tackles', 'TacklesW', 'Def_3rd', 'Mid_3rd', 'Att_3rd',
'Tkl', 'Att', 'Tkl%', 'Lost', 'Blocks', 'ShotBlock', 'PassBlock',
'Interceptions', 'Tkl+Int', 'Clrearances', 'Errors']
for dfvalue in [df_PL_D, df_LL_D, df_BL_D, df_SA_D, df_L1_D]:
dfvalue.columns = d_column_names
dfvalue.drop(['#Pl', '90s', 'Tkl+Int', 'ShotBlock', 'PassBlock', 'Tkl%',"Lost"], axis=1, inplace=True)
# Creating a new dataframe with the defense data
df_Def = pd.concat([df_PL_D, df_LL_D, df_BL_D, df_SA_D, df_L1_D], ignore_index=True)
df_Def.head(5)
| Squad | Tackles | TacklesW | Def_3rd | Mid_3rd | Att_3rd | Tkl | Att | Blocks | Interceptions | Clrearances | Errors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Arsenal | 500 | 302 | 213 | 185 | 102 | 215 | 424 | 320 | 214 | 540 | 19 |
| 1 | Aston Villa | 569 | 302 | 278 | 223 | 68 | 284 | 574 | 401 | 278 | 633 | 12 |
| 2 | Bournemouth | 558 | 338 | 282 | 226 | 50 | 226 | 526 | 451 | 310 | 918 | 9 |
| 3 | Brentford | 528 | 303 | 245 | 209 | 74 | 230 | 544 | 393 | 314 | 745 | 11 |
| 4 | Brighton | 538 | 297 | 236 | 211 | 91 | 232 | 461 | 336 | 268 | 368 | 14 |
df_5L = pd.read_csv('5L.csv', delimiter=',')
# Removing the name of the top scorer because that is not needed. However, I will be keeping
# number of the goals scored by the top scorer becuase that
# shows how the team's best player is performing which could be a factor in how the team performes
df_5L['Top_Scorer_Goals'] = df_5L['Top Team Scorer'].str.split(' - ', expand=True)[1]
df_5L['Top_Scorer_Goals'] = df_5L['Top_Scorer_Goals'].astype(float)
# Removing the columns that have repeat data or data that I will not use
df_5L.drop(columns=['Last 5','Country','GF','xG','xGA', 'xGD', 'xGD/90',
'Top Team Scorer', 'MP', 'Rk', 'Goalkeeper'],axis=1,inplace=True)
df_5L.head(5)
| Squad | LgRk | W | D | L | GA | GD | Pts | Pts/MP | Attendance | Top_Scorer_Goals | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Barcelona | 1 | 26 | 4 | 3 | 11 | 49 | 82 | 2.48 | 82920 | 19.0 |
| 1 | Napoli | 1 | 25 | 5 | 3 | 23 | 46 | 80 | 2.42 | 24058 | 22.0 |
| 2 | Manchester City | 1 | 25 | 4 | 4 | 30 | 57 | 79 | 2.39 | 53226 | 35.0 |
| 3 | Arsenal | 2 | 24 | 6 | 4 | 39 | 42 | 78 | 2.29 | 60200 | 15.0 |
| 4 | Paris S-G | 1 | 24 | 3 | 6 | 34 | 42 | 75 | 2.27 | 38125 | 23.0 |
# Method to get the data from the website for the market value of the teams for each of the leagues
def getTeamWorth(url, headers):
# Using beautiful soup to get the HTML from the website
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# Using beautiful soup methods to get the correct HTML tags that are reuired to scrape the data
tab = soup.find("table",{"class":"items"})
tab
values = tab.findChildren('tbody')
values
odd_rows = values[0].find_all("tr", {"class": "odd"})
even_rows = values[0].find_all("tr", {"class": "even"})
rows = odd_rows + even_rows
teams = {}
# For each of the teams in the wepage I am isolating the team name and the market value of the team
for row in rows:
team_name = row.find('td', {'class': 'hauptlink no-border-links'}).a.text
team_value = row.find_all('td', {'class': 'rechts'})[1].text
teams[team_name] = team_value
return teams
url_pl = 'https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1'
url_ll = 'https://www.transfermarkt.com/primera-division/startseite/wettbewerb/ES1'
url_bl = 'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1'
url_sa = 'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1'
url_l1 = 'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1'
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/112.0.0.0 Safari/537.36"}
# Creating a dataframe for each leauge with the market value of the teams for each of the leagues
PL_Worth = pd.DataFrame(list(getTeamWorth(url_pl, headers).items()), columns=['Squad', 'Worth'])
LL_Worth = pd.DataFrame(list(getTeamWorth(url_ll, headers).items()), columns=['Squad', 'Worth'])
BL_Worth = pd.DataFrame(list(getTeamWorth(url_bl, headers).items()), columns=['Squad', 'Worth'])
SA_Worth = pd.DataFrame(list(getTeamWorth(url_sa, headers).items()), columns=['Squad', 'Worth'])
L1_Worth = pd.DataFrame(list(getTeamWorth(url_l1, headers).items()), columns=['Squad', 'Worth'])
# Putting the dataframes for each of the leagues into one dataframe and isolating the 2 columns
# that I need
df_worth = pd.concat([PL_Worth, LL_Worth, BL_Worth, SA_Worth, L1_Worth])
df_worth = df_worth[['Squad','Worth']]
df_worth.head(5)
| Squad | Worth | |
|---|---|---|
| 0 | Manchester City | €1.05bn |
| 1 | Arsenal FC | €890.00m |
| 2 | Manchester United | €795.70m |
| 3 | Newcastle United | €494.30m |
| 4 | Leicester City | €443.10m |
squad_map = {
'1.FC Köln': 'Köln',
'1.FC Union Berlin': 'Union Berlin',
'1.FSV Mainz 05': 'Mainz 05',
'AC Ajaccio': 'Ajaccio',
'AC Milan': 'Milan',
'AC Monza': 'Monza',
'ACF Fiorentina': 'Fiorentina',
'AFC Bournemouth': 'Bournemouth',
'AJ Auxerre': 'Auxerre',
'AS Monaco': 'Monaco',
'AS Roma': 'Roma',
'Angers SCO': 'Angers',
'Arsenal FC': 'Arsenal',
'Aston Villa': 'Aston Villa',
'Atalanta BC': 'Atalanta',
'Athletic Bilbao': 'Athletic Club',
'Atlético de Madrid': 'Atlético Madrid',
'Bayer 04 Leverkusen': 'Leverkusen',
'Bayern Munich': 'Bayern Munich',
'Bologna FC 1909': 'Bologna',
'Borussia Dortmund': 'Dortmund',
'Borussia Mönchengladbach': "M'Gladbach",
'Brentford FC': 'Brentford',
'Brighton & Hove Albion': 'Brighton',
'CA Osasuna': 'Osasuna',
'Celta de Vigo': 'Celta Vigo',
'Chelsea FC': 'Chelsea',
'Clermont Foot 63': 'Clermont Foot',
'Crystal Palace': 'Crystal Palace',
'Cádiz CF': 'Cádiz',
'ESTAC Troyes': 'Troyes',
'Eintracht Frankfurt': 'Eint Frankfurt',
'Elche CF': 'Elche',
'Everton FC': 'Everton',
'FC Augsburg': 'Augsburg',
'FC Barcelona': 'Barcelona',
'FC Empoli': 'Empoli',
'FC Lorient': 'Lorient',
'FC Nantes': 'Nantes',
'FC Schalke 04': 'Schalke 04',
'FC Toulouse': 'Toulouse',
'Fulham FC': 'Fulham',
'Getafe CF': 'Getafe',
'Girona FC': 'Girona',
'Hellas Verona': 'Hellas Verona',
'Hertha BSC': 'Hertha BSC',
'Inter Milan': 'Inter',
'Juventus FC': 'Juventus',
'LOSC Lille': 'Lille',
'Leeds United': 'Leeds United',
'Leicester City': 'Leicester City',
'Liverpool FC': 'Liverpool',
'Manchester City': 'Manchester City',
'Manchester United': 'Manchester Utd',
'Montpellier HSC': 'Montpellier',
'Newcastle United': 'Newcastle Utd',
'Nottingham Forest': "Nott'ham Forest",
'OGC Nice': 'Nice',
'Olympique Lyon': 'Lyon',
'Olympique Marseille': 'Marseille',
'Paris Saint-Germain': 'Paris S-G',
'RB Leipzig': 'RB Leipzig',
'RC Lens': 'Lens',
'RC Strasbourg Alsace': 'Strasbourg',
'RCD Espanyol Barcelona': 'Espanyol',
'RCD Mallorca': 'Mallorca',
'Real Betis Balompié': 'Betis',
'Real Madrid': 'Real Madrid',
'Real Sociedad': 'Real Sociedad',
'Real Valladolid CF': 'Valladolid',
'Red Bull Salzburg': 'RB Salzburg',
'SC Freiburg': 'Freiburg',
'SD Eibar': 'Eibar',
'SSC Napoli': 'Napoli',
'SV Werder Bremen': 'Werder Bremen',
'Sheffield United': 'Sheffield Utd',
'Southampton FC': 'Southampton',
'Stade Brestois 29': 'Brest',
'Stade Rennais FC': 'Rennes',
'Stade Reims': 'Reims',
'Standard de Liège': 'Standard Liège',
'TSG 1899 Hoffenheim': 'Hoffenheim',
'Tottenham Hotspur': 'Tottenham',
'UD Almería': 'Almería',
'UD Las Palmas': 'Las Palmas',
'US Lecce': 'Lecce',
'Valencia CF': 'Valencia',
'VfB Stuttgart': 'Stuttgart',
'VfL Bochum': 'Bochum',
'VfL Wolfsburg': 'Wolfsburg',
'Villarreal CF': 'Villarreal',
'West Bromwich Albion': 'West Brom',
'West Ham United': 'West Ham',
'Wolverhampton Wanderers': 'Wolves',
'Águilas Doradas': 'Águilas Doradas',
'US Cremonese' : 'Cremonese',
'SS Lazio' : 'Lazio',
'US Salernitana 1919' : 'Salernitana',
'UC Sampdo' : 'Sampdo',
'US Sassuolo' : 'Sassuolo',
'Sevilla FC' : 'Sevilla',
'Spezia Calcio' :'Spezia',
'Torino FC' : 'Torino',
'Udinese Calcio' : 'Udinese',
'UC Sampdoria' : 'Sampdoria'
}
df_worth['Squad'] = df_worth['Squad'].map(squad_map).fillna(df_worth['Squad'])
df_worth.head(5)
| Squad | Worth | |
|---|---|---|
| 0 | Manchester City | €1.05bn |
| 1 | Arsenal | €890.00m |
| 2 | Manchester Utd | €795.70m |
| 3 | Newcastle Utd | €494.30m |
| 4 | Leicester City | €443.10m |
# Merging the general and defense dataframes
df_merge1 = pd.merge(df_Gen, df_Def, on="Squad", suffixes=('', ''))
# Merging the dataframe with the information about the 5 leagues with the dataframe with the general and defense data
df_merge2 = pd.merge(df_merge1, df_5L, on="Squad", suffixes=('', ''))
# Merging the offense dataframe with the dataframe with the information about the 5 league along with the general and defense data
df_merge3 = pd.merge(df_Off, df_merge2, on="Squad", suffixes=('', ''))
# Merging the dataframe with the information about the squad's worth with the dataframe with all the other information
df = pd.merge(df_worth, df_merge3, on="Squad", suffixes=('', ''))
def parse_worth(worth):
if worth.startswith('€') and worth.endswith('m'):
return float(worth[1:-1]) * 1000000
elif worth.startswith('€') and worth.endswith('bn'):
return float(worth[1:-2]) * 1000000000
else:
return np.nan
df['Worth'] = df['Worth'].apply(parse_worth)
df.sample(n=20)
| Squad | Worth | Shot_Creating_Actions | SCA/90 | PassLive | PassDead | TakeOnsTS | ShotTS | FoulTS | DefTS | Goal_Creating_Actions | GCA/90 | #OfPlayers | Age | Poss | MatchesPlayed | Starts | Min | 90s | Goals | Asists | G+A | G-PK | PK | PKAttempts | YCard | RCard | xG | NonPKxG | xAG | npxG+xAG | PrgC | PrgP | G/90 | A/90 | G+A/90 | G-PK/90 | G+A-PK/90 | xG/90 | xAG/90 | xG+xAG/90 | npxG/90 | npxG+xAG/90 | League | Tackles | TacklesW | Def_3rd | Mid_3rd | Att_3rd | Tkl | Att | Blocks | Interceptions | Clrearances | Errors | LgRk | W | D | L | GA | GD | Pts | Pts/MP | Attendance | Top_Scorer_Goals | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 39 | Elche | 60700000.0 | 605 | 18.33 | 413 | 87 | 35 | 32 | 28 | 10 | 45 | 1.36 | 32 | 29.2 | 43.8 | 33 | 363 | 2970 | 33.0 | 25 | 18 | 43 | 21 | 4 | 5 | 90 | 12 | 31.0 | 27.1 | 22.1 | 49.2 | 466 | 986 | 0.76 | 0.55 | 1.30 | 0.64 | 1.18 | 0.94 | 0.67 | 1.61 | 0.82 | 1.49 | La Liga | 563 | 319 | 323 | 196 | 44 | 254 | 526 | 359 | 283 | 643 | 11 | 20 | 3 | 7 | 23 | 64 | -39 | 16 | 0.48 | 20350 | 6.0 |
| 29 | Cádiz | 62400000.0 | 558 | 16.91 | 390 | 61 | 31 | 31 | 30 | 15 | 42 | 1.27 | 37 | 29.8 | 41.5 | 33 | 363 | 2970 | 33.0 | 25 | 17 | 42 | 22 | 3 | 3 | 94 | 7 | 32.6 | 30.4 | 22.4 | 52.8 | 409 | 825 | 0.76 | 0.52 | 1.27 | 0.67 | 1.18 | 0.99 | 0.68 | 1.67 | 0.92 | 1.60 | La Liga | 559 | 364 | 327 | 178 | 54 | 288 | 593 | 385 | 229 | 763 | 8 | 15 | 8 | 11 | 14 | 49 | -23 | 35 | 1.06 | 17554 | 3.0 |
| 4 | Leicester City | 443100000.0 | 679 | 19.97 | 495 | 48 | 44 | 46 | 32 | 14 | 78 | 2.29 | 28 | 26.9 | 49.0 | 34 | 374 | 3060 | 34.0 | 44 | 32 | 76 | 42 | 2 | 4 | 57 | 2 | 45.9 | 42.7 | 36.1 | 78.9 | 530 | 1209 | 1.29 | 0.94 | 2.24 | 1.24 | 2.18 | 1.35 | 1.06 | 2.41 | 1.26 | 2.32 | Premier Leauge | 622 | 372 | 303 | 260 | 59 | 284 | 612 | 458 | 328 | 684 | 10 | 16 | 8 | 6 | 20 | 59 | -13 | 30 | 0.88 | 31850 | 10.0 |
| 7 | Nott'ham Forest | 332650000.0 | 545 | 16.03 | 386 | 50 | 28 | 34 | 34 | 13 | 40 | 1.18 | 33 | 27.3 | 38.7 | 34 | 374 | 3060 | 34.0 | 28 | 16 | 44 | 26 | 2 | 5 | 77 | 0 | 35.2 | 31.3 | 23.8 | 55.2 | 405 | 796 | 0.82 | 0.47 | 1.29 | 0.76 | 1.24 | 1.04 | 0.70 | 1.74 | 0.92 | 1.62 | Premier Leauge | 574 | 342 | 324 | 183 | 67 | 264 | 514 | 456 | 302 | 849 | 6 | 18 | 7 | 9 | 18 | 62 | -32 | 30 | 0.88 | 29171 | 8.0 |
| 48 | Schalke 04 | 60400000.0 | 648 | 20.90 | 437 | 86 | 22 | 49 | 45 | 9 | 50 | 1.61 | 35 | 28.3 | 46.1 | 30 | 330 | 2700 | 30.0 | 26 | 19 | 45 | 23 | 3 | 4 | 59 | 1 | 34.0 | 30.8 | 24.0 | 54.8 | 358 | 816 | 0.87 | 0.63 | 1.50 | 0.77 | 1.40 | 1.13 | 0.80 | 1.93 | 1.03 | 1.83 | Bundesliga | 491 | 293 | 221 | 221 | 49 | 226 | 489 | 358 | 268 | 632 | 9 | 14 | 7 | 9 | 15 | 59 | -28 | 30 | 0.97 | 61105 | 11.0 |
| 35 | Osasuna | 127700000.0 | 665 | 20.15 | 482 | 69 | 35 | 40 | 27 | 12 | 54 | 1.64 | 26 | 28.3 | 48.1 | 33 | 363 | 2970 | 33.0 | 28 | 22 | 50 | 24 | 4 | 5 | 86 | 7 | 33.6 | 29.7 | 22.9 | 52.6 | 456 | 1179 | 0.85 | 0.67 | 1.52 | 0.73 | 1.39 | 1.02 | 0.69 | 1.71 | 0.90 | 1.59 | La Liga | 460 | 278 | 223 | 168 | 69 | 224 | 502 | 331 | 269 | 646 | 10 | 10 | 12 | 8 | 13 | 35 | -6 | 44 | 1.33 | 19483 | 7.0 |
| 24 | Sevilla | 209000000.0 | 738 | 22.36 | 562 | 68 | 19 | 35 | 37 | 17 | 71 | 2.15 | 33 | 29.5 | 54.4 | 33 | 363 | 2970 | 33.0 | 41 | 26 | 67 | 37 | 4 | 5 | 110 | 11 | 40.0 | 36.3 | 26.9 | 63.1 | 486 | 1372 | 1.24 | 0.79 | 2.03 | 1.12 | 1.91 | 1.21 | 0.81 | 2.03 | 1.10 | 1.91 | La Liga | 496 | 312 | 211 | 194 | 91 | 226 | 474 | 296 | 305 | 543 | 9 | 11 | 12 | 8 | 13 | 49 | -8 | 44 | 1.33 | 35559 | 8.0 |
| 33 | Valencia | 225650000.0 | 741 | 22.45 | 515 | 75 | 51 | 39 | 47 | 14 | 53 | 1.61 | 28 | 25.0 | 53.4 | 33 | 363 | 2970 | 33.0 | 34 | 19 | 53 | 29 | 5 | 8 | 81 | 6 | 41.5 | 35.2 | 27.3 | 62.5 | 665 | 1280 | 1.03 | 0.58 | 1.61 | 0.88 | 1.45 | 1.26 | 0.83 | 2.08 | 1.07 | 1.89 | La Liga | 596 | 372 | 298 | 227 | 71 | 268 | 532 | 317 | 235 | 529 | 20 | 17 | 9 | 7 | 17 | 40 | -4 | 34 | 1.03 | 40076 | 5.0 |
| 22 | Villarreal | 278200000.0 | 759 | 23.00 | 571 | 42 | 61 | 48 | 31 | 6 | 70 | 2.12 | 31 | 29.4 | 56.2 | 33 | 363 | 2970 | 33.0 | 46 | 28 | 74 | 42 | 4 | 7 | 77 | 4 | 52.1 | 46.6 | 36.8 | 83.4 | 746 | 1394 | 1.39 | 0.85 | 2.24 | 1.27 | 2.12 | 1.58 | 1.12 | 2.69 | 1.41 | 2.53 | La Liga | 493 | 317 | 242 | 166 | 85 | 231 | 456 | 323 | 227 | 614 | 15 | 5 | 16 | 6 | 11 | 34 | 13 | 54 | 1.64 | 16344 | 7.0 |
| 32 | Betis | 252500000.0 | 647 | 19.61 | 474 | 48 | 48 | 39 | 32 | 6 | 59 | 1.79 | 30 | 29.3 | 50.0 | 33 | 363 | 2970 | 33.0 | 37 | 21 | 58 | 29 | 8 | 9 | 71 | 13 | 45.9 | 38.8 | 30.0 | 68.8 | 536 | 1108 | 1.12 | 0.64 | 1.76 | 0.88 | 1.52 | 1.39 | 0.91 | 2.30 | 1.18 | 2.09 | La Liga | 572 | 356 | 322 | 206 | 44 | 289 | 596 | 361 | 317 | 747 | 6 | 6 | 15 | 7 | 11 | 37 | 3 | 52 | 1.58 | 50034 | 12.0 |
| 57 | Bochum | 47750000.0 | 610 | 19.68 | 414 | 82 | 20 | 38 | 45 | 11 | 56 | 1.81 | 26 | 30.2 | 45.6 | 30 | 330 | 2700 | 30.0 | 31 | 22 | 53 | 28 | 3 | 3 | 54 | 1 | 35.3 | 33.0 | 24.2 | 57.2 | 308 | 894 | 1.03 | 0.73 | 1.77 | 0.93 | 1.67 | 1.18 | 0.81 | 1.98 | 1.10 | 1.91 | Bundesliga | 565 | 349 | 279 | 215 | 71 | 274 | 554 | 358 | 293 | 669 | 12 | 17 | 8 | 4 | 18 | 67 | -34 | 28 | 0.93 | 25330 | 8.0 |
| 89 | Marseille | 259200000.0 | 872 | 25.65 | 629 | 64 | 57 | 66 | 36 | 20 | 90 | 2.65 | 29 | 28.0 | 56.0 | 33 | 363 | 2970 | 33.0 | 55 | 33 | 88 | 50 | 5 | 6 | 55 | 4 | 58.1 | 53.5 | 41.4 | 95.0 | 673 | 1651 | 1.67 | 1.00 | 2.67 | 1.52 | 2.52 | 1.76 | 1.26 | 3.02 | 1.62 | 2.88 | Ligue 1 | 591 | 353 | 235 | 271 | 85 | 251 | 507 | 335 | 380 | 508 | 10 | 2 | 21 | 7 | 5 | 32 | 29 | 70 | 2.12 | 55175 | 13.0 |
| 80 | Nice | 235600000.0 | 774 | 22.76 | 572 | 66 | 55 | 46 | 23 | 12 | 74 | 2.18 | 32 | 27.3 | 51.6 | 33 | 363 | 2970 | 33.0 | 40 | 28 | 68 | 34 | 6 | 7 | 47 | 3 | 47.2 | 41.8 | 32.9 | 74.7 | 662 | 1314 | 1.21 | 0.85 | 2.06 | 1.03 | 1.88 | 1.43 | 1.00 | 2.43 | 1.27 | 2.26 | Ligue 1 | 692 | 394 | 354 | 245 | 93 | 322 | 567 | 367 | 362 | 517 | 8 | 9 | 12 | 12 | 9 | 31 | 9 | 48 | 1.45 | 18770 | 9.0 |
| 41 | RB Leipzig | 487300000.0 | 782 | 25.23 | 567 | 64 | 46 | 58 | 41 | 6 | 92 | 2.97 | 25 | 27.1 | 58.9 | 30 | 330 | 2700 | 30.0 | 54 | 34 | 88 | 50 | 4 | 5 | 60 | 2 | 53.2 | 49.2 | 37.2 | 86.5 | 525 | 1279 | 1.80 | 1.13 | 2.93 | 1.67 | 2.80 | 1.77 | 1.24 | 3.01 | 1.64 | 2.88 | Bundesliga | 463 | 279 | 206 | 199 | 58 | 220 | 445 | 367 | 271 | 469 | 3 | 5 | 16 | 6 | 8 | 37 | 17 | 54 | 1.80 | 45813 | 13.0 |
| 97 | Ajaccio | 27150000.0 | 513 | 15.09 | 331 | 71 | 20 | 31 | 51 | 9 | 37 | 1.09 | 34 | 30.0 | 45.0 | 33 | 363 | 2970 | 33.0 | 21 | 12 | 33 | 15 | 6 | 9 | 78 | 8 | 34.9 | 28.2 | 21.8 | 50.1 | 441 | 1075 | 0.64 | 0.36 | 1.00 | 0.45 | 0.82 | 1.06 | 0.66 | 1.72 | 0.86 | 1.52 | Ligue 1 | 546 | 331 | 260 | 211 | 75 | 263 | 571 | 305 | 372 | 534 | 7 | 19 | 6 | 4 | 23 | 61 | -39 | 22 | 0.67 | 6450 | 6.0 |
| 16 | Everton | 352900000.0 | 651 | 19.15 | 443 | 71 | 33 | 49 | 38 | 17 | 47 | 1.38 | 28 | 27.4 | 43.5 | 34 | 374 | 3060 | 34.0 | 26 | 19 | 45 | 23 | 3 | 3 | 73 | 2 | 38.3 | 35.9 | 26.7 | 62.6 | 447 | 947 | 0.76 | 0.56 | 1.32 | 0.68 | 1.24 | 1.13 | 0.79 | 1.91 | 1.06 | 1.84 | Premier Leauge | 636 | 383 | 328 | 233 | 75 | 267 | 533 | 466 | 338 | 772 | 17 | 19 | 6 | 11 | 17 | 52 | -25 | 29 | 0.85 | 39238 | 5.0 |
| 60 | Atalanta | 329500000.0 | 824 | 24.24 | 609 | 73 | 57 | 46 | 30 | 9 | 99 | 2.91 | 25 | 27.2 | 49.8 | 33 | 363 | 2970 | 33.0 | 55 | 35 | 90 | 49 | 6 | 8 | 77 | 3 | 49.0 | 42.5 | 36.3 | 78.8 | 600 | 1428 | 1.67 | 1.06 | 2.73 | 1.48 | 2.55 | 1.48 | 1.10 | 2.58 | 1.29 | 2.39 | Serie A | 562 | 318 | 263 | 236 | 63 | 279 | 544 | 396 | 367 | 547 | 9 | 5 | 17 | 7 | 9 | 39 | 17 | 58 | 1.76 | 10477 | 13.0 |
| 3 | Newcastle Utd | 494300000.0 | 876 | 25.76 | 590 | 101 | 56 | 67 | 51 | 11 | 95 | 2.79 | 26 | 28.1 | 52.1 | 33 | 363 | 2970 | 33.0 | 58 | 38 | 96 | 54 | 4 | 4 | 53 | 1 | 60.9 | 57.9 | 45.2 | 103.0 | 577 | 1394 | 1.76 | 1.15 | 2.91 | 1.64 | 2.79 | 1.85 | 1.37 | 3.21 | 1.75 | 3.12 | Premier Leauge | 537 | 321 | 237 | 223 | 77 | 272 | 562 | 403 | 291 | 599 | 10 | 3 | 18 | 11 | 4 | 27 | 34 | 65 | 1.97 | 52243 | 15.0 |
| 25 | Getafe | 136700000.0 | 549 | 16.64 | 364 | 74 | 23 | 31 | 51 | 6 | 44 | 1.33 | 25 | 28.1 | 40.9 | 33 | 363 | 2970 | 33.0 | 30 | 18 | 48 | 25 | 5 | 8 | 107 | 9 | 30.9 | 24.7 | 18.8 | 43.5 | 323 | 888 | 0.91 | 0.55 | 1.45 | 0.76 | 1.30 | 0.94 | 0.57 | 1.51 | 0.75 | 1.32 | La Liga | 465 | 281 | 248 | 172 | 45 | 238 | 518 | 349 | 275 | 688 | 9 | 18 | 8 | 10 | 15 | 42 | -12 | 34 | 1.03 | 11342 | 14.0 |
| 49 | Dortmund | 547700000.0 | 846 | 27.29 | 621 | 60 | 62 | 57 | 34 | 12 | 128 | 4.13 | 29 | 26.5 | 57.7 | 30 | 330 | 2700 | 30.0 | 65 | 50 | 115 | 62 | 3 | 3 | 62 | 0 | 55.3 | 53.1 | 42.3 | 95.4 | 639 | 1381 | 2.17 | 1.67 | 3.83 | 2.07 | 3.73 | 1.84 | 1.41 | 3.25 | 1.77 | 3.18 | Bundesliga | 515 | 285 | 223 | 216 | 76 | 258 | 486 | 335 | 273 | 434 | 9 | 2 | 19 | 4 | 7 | 40 | 27 | 61 | 2.03 | 81199 | 8.0 |
df_means = df.groupby('League').mean(numeric_only=True)
df_means
| Worth | Shot_Creating_Actions | SCA/90 | PassLive | PassDead | TakeOnsTS | ShotTS | FoulTS | DefTS | Goal_Creating_Actions | GCA/90 | #OfPlayers | Age | Poss | MatchesPlayed | Starts | Min | 90s | Goals | Asists | G+A | G-PK | PK | PKAttempts | YCard | RCard | xG | NonPKxG | xAG | npxG+xAG | PrgC | PrgP | G/90 | A/90 | G+A/90 | G-PK/90 | G+A-PK/90 | xG/90 | xAG/90 | xG+xAG/90 | npxG/90 | npxG+xAG/90 | Tackles | TacklesW | Def_3rd | Mid_3rd | Att_3rd | Tkl | Att | Blocks | Interceptions | Clrearances | Errors | LgRk | W | D | L | GA | GD | Pts | Pts/MP | Attendance | Top_Scorer_Goals | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| League | |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| Bundesliga | 2.403961e+08 | 676.111111 | 21.809444 | 481.444444 | 65.888889 | 37.944444 | 43.833333 | 36.666667 | 10.333333 | 81.555556 | 2.631667 | 28.055556 | 27.294444 | 50.005556 | 30.0 | 330.0 | 2700.0 | 30.0 | 45.666667 | 32.111111 | 77.777778 | 41.666667 | 4.0 | 5.222222 | 61.055556 | 2.055556 | 42.394444 | 38.316667 | 30.233333 | 68.555556 | 446.388889 | 1072.444444 | 1.522778 | 1.0700 | 2.591667 | 1.388333 | 2.458889 | 1.412222 | 1.008333 | 2.420556 | 1.276667 | 2.2850 | 491.611111 | 290.777778 | 228.666667 | 203.388889 | 59.555556 | 233.50 | 471.833333 | 357.888889 | 280.944444 | 591.444444 | 8.111111 | 9.5 | 11.333333 | 7.555556 | 11.333333 | 47.222222 | 0.0 | 41.555556 | 1.375556 | 42816.888889 | 10.111111 |
| La Liga | 2.392275e+08 | 710.600000 | 21.533000 | 519.200000 | 64.950000 | 40.350000 | 39.300000 | 35.250000 | 11.550000 | 67.500000 | 2.046000 | 29.200000 | 28.160000 | 49.995000 | 33.0 | 363.0 | 2970.0 | 33.0 | 39.700000 | 27.850000 | 67.550000 | 36.000000 | 3.7 | 5.200000 | 84.600000 | 6.300000 | 42.270000 | 38.245000 | 30.445000 | 68.675000 | 570.500000 | 1218.050000 | 1.202500 | 0.8460 | 2.047500 | 1.091500 | 1.934500 | 1.282000 | 0.922500 | 2.203500 | 1.158500 | 2.0805 | 529.850000 | 322.000000 | 261.800000 | 199.000000 | 69.050000 | 248.10 | 521.550000 | 345.450000 | 268.500000 | 610.200000 | 10.200000 | 10.5 | 12.750000 | 7.500000 | 12.750000 | 41.150000 | 0.0 | 45.750000 | 1.385000 | 29520.350000 | 10.150000 |
| Ligue 1 | 1.785250e+08 | 719.950000 | 21.175000 | 522.800000 | 60.800000 | 42.750000 | 44.950000 | 35.400000 | 13.250000 | 78.100000 | 2.297000 | 29.400000 | 27.085000 | 50.005000 | 33.0 | 363.0 | 2970.0 | 33.0 | 44.450000 | 29.700000 | 74.150000 | 39.650000 | 4.8 | 6.100000 | 56.450000 | 4.600000 | 46.340000 | 41.625000 | 32.010000 | 73.640000 | 584.100000 | 1373.900000 | 1.347500 | 0.9000 | 2.247500 | 1.201500 | 2.102500 | 1.404500 | 0.971500 | 2.374500 | 1.260500 | 2.2310 | 591.450000 | 345.550000 | 281.150000 | 231.000000 | 79.300000 | 286.60 | 593.300000 | 371.050000 | 356.350000 | 552.600000 | 10.150000 | 10.5 | 12.450000 | 8.100000 | 12.450000 | 46.200000 | 0.0 | 45.450000 | 1.376500 | 20859.750000 | 12.750000 |
| Premier Leauge | 5.212050e+08 | 754.400000 | 21.957500 | 548.300000 | 65.900000 | 41.300000 | 50.450000 | 36.100000 | 12.350000 | 79.700000 | 2.319000 | 27.800000 | 27.410000 | 50.075000 | 33.7 | 370.7 | 3033.0 | 33.7 | 45.650000 | 31.450000 | 77.100000 | 42.350000 | 3.3 | 4.300000 | 61.950000 | 1.450000 | 47.255000 | 43.995000 | 34.275000 | 78.255000 | 563.400000 | 1276.750000 | 1.356500 | 0.9365 | 2.294000 | 1.259500 | 2.196000 | 1.407000 | 1.020000 | 2.424500 | 1.309000 | 2.3275 | 577.800000 | 337.800000 | 280.950000 | 218.900000 | 77.950000 | 258.25 | 530.850000 | 406.450000 | 303.300000 | 660.250000 | 11.550000 | 10.5 | 13.000000 | 7.700000 | 13.000000 | 47.700000 | 0.0 | 46.700000 | 1.389500 | 40225.700000 | 12.600000 |
| Serie A | 2.312545e+08 | 749.250000 | 22.037000 | 540.850000 | 70.900000 | 40.300000 | 48.300000 | 36.500000 | 12.400000 | 71.100000 | 2.091500 | 29.050000 | 27.380000 | 49.990000 | 33.0 | 363.0 | 2970.0 | 33.0 | 40.400000 | 28.700000 | 69.100000 | 36.700000 | 3.7 | 4.700000 | 74.800000 | 2.800000 | 41.505000 | 37.915000 | 30.260000 | 68.160000 | 563.750000 | 1182.800000 | 1.225000 | 0.8700 | 2.093500 | 1.112000 | 1.981500 | 1.258500 | 0.917000 | 2.173500 | 1.148500 | 2.0660 | 535.700000 | 307.700000 | 261.550000 | 208.850000 | 65.300000 | 243.35 | 490.150000 | 361.300000 | 281.250000 | 588.700000 | 8.300000 | 10.5 | 12.050000 | 8.900000 | 12.050000 | 41.650000 | 0.0 | 45.050000 | 1.365000 | 16325.200000 | 10.300000 |
for index, row in df.iterrows():
if row['League'] == 'Ligue 1':
if row['LgRk'] in range(1,4):
df.at[index, 'CL/REL'] = 'CL'
elif row['LgRk'] == 4:
df.at[index, 'CL/REL'] = 'EL'
elif row['LgRk'] == 5:
df.at[index, 'CL/REL'] = 'EC'
elif row['LgRk'] in range(17,21):
df.at[index, 'CL/REL'] = 'REL'
else:
df.at[index, 'CL/REL'] = 'Other'
elif row['League'] == 'Premier Leauge':
if row['LgRk'] in range(1,5):
df.at[index, 'CL/REL'] = 'CL'
elif row['LgRk'] == 5:
df.at[index, 'CL/REL'] = 'EL'
elif row['LgRk'] in range(18,21):
df.at[index, 'CL/REL'] = 'REL'
else:
df.at[index, 'CL/REL'] = 'Other'
else:
if row['LgRk'] in range(1,5):
df.at[index, 'CL/REL'] = 'CL'
elif row['LgRk'] == 5:
df.at[index, 'CL/REL'] = 'EL'
elif row['LgRk'] == 6:
df.at[index, 'CL/REL'] = 'EC'
elif row['LgRk'] in range(18,21):
df.at[index, 'CL/REL'] = 'REL'
else:
df.at[index, 'CL/REL'] = 'Other'
pd.options.display.max_rows = None
fig = px.histogram(df, x='CL/REL', title='Distribution Of Qualifications For UEFA Tournaments and Relegation')
fig.update_layout(
xaxis=dict(
tickmode='array',
tickvals=list(range(len(df))),
ticktext=["Champions Leauge", "No CL, EL, EC, Relegation", "Relegation", "Europa Leauge", "Europa Confrence"]
)
)
fig.show()
fig = px.scatter(df,x='Goals',y="Pts", hover_data=['Squad'], color="CL/REL", title="Points by League and Number Of Goals")
fig.update_layout(
xaxis_title='Goals',
yaxis_title='Points'
)
fig.show()
std_dev = df.groupby('League')['Pts'].std()
std_dev
League Bundesliga 12.561483 La Liga 15.341293 Ligue 1 16.847536 Premier Leauge 16.300145 Serie A 16.311039 Name: Pts, dtype: float64
fig = px.scatter(df, x='Goals', y='Pts', color='League', trendline='ols', facet_col='League', width=1000, height=600)
fig.update_layout(
title='Scatter Plot of Points vs Goals by League',
xaxis_title='Goals',
yaxis_title='Points'
)
fig.show()
color_dict = {'Premier Leauge': 'blue', 'La Liga': 'red', 'Bundesliga': 'green', 'Serie A': 'purple', 'Ligue 1': 'orange'}
fig = make_subplots(rows=1, cols=3, subplot_titles=('Def_3rd', 'Mid_3rd', 'Att_3rd'),
shared_xaxes=True, shared_yaxes=True)
fig.add_trace(go.Scatter(x=df['Pts'], y=df['Def_3rd'], mode='markers',
name='Defensive 3rd', text=df['Squad'],
hovertemplate='Squad: %{text}<br>Pts: %{x}<br>Def_3rd: %{y}',
marker=dict(color=df['League'].apply(lambda x: color_dict[x]))),
row=1, col=1)
fig.add_trace(go.Scatter(x=df['Pts'], y=df['Mid_3rd'], mode='markers',
name='Middle 3rd', text=df['Squad'],
hovertemplate='Squad: %{text}<br>Pts: %{x}<br>Mid_3rd: %{y}',
marker=dict(color=df['League'].apply(lambda x: color_dict[x]))),
row=1, col=2)
fig.add_trace(go.Scatter(x=df['Pts'], y=df['Att_3rd'], mode='markers',
name='Attacking 3rd', text=df['Squad'],
hovertemplate='Squad: %{text}<br>Pts: %{x}<br>Att_3rd: %{y}',
marker=dict(color=df['League'].apply(lambda x: color_dict[x]))),
row=1, col=3)
fig.update_layout(
title='Scatterplots of Points vs. 3rd of the Field',
xaxis_title='Points',
yaxis_title='Tackles in the 3rd of the Field',
width=1200,
height=600
)
fig.show()
fig = px.scatter(df, x='Goals', y='Pts', trendline='ols', title='Scatter Plot Of Goals vs Points With A Regression Line', hover_name='Squad')
fig.update_layout(
xaxis_title='Goals',
yaxis_title='Points In Leauge'
)
fig.show()
fig = px.scatter(df, x='Goal_Creating_Actions', y='Goals', title='Goal Creating Actions vs Goals Scatter Plot', color='League', hover_name='Squad')
fig.update_layout(
xaxis_title='Goal Creating Actions',
yaxis_title='Goals'
)
fig.show()
df['Progresive_Plays'] = df['PrgC'] + df['PrgP']
fig = px.scatter(df, x='Progresive_Plays', y='Goals', title='Scatter Plot Of The Number of Progresive Plays vs The Number Of Goals', color='League', hover_name='Squad')
fig.update_layout(xaxis=dict(showticklabels=False))
fig.update_yaxes(title="Number of Goals")
fig.update_xaxes(title="Number of Progresive Plays")
fig.show()
df["TackleWP"] = df["TacklesW"]/df["Tackles"]
fig = px.scatter(df, x='TackleWP', y='GA', title='Scatter Plot Of The Percentage Of Successful Tackles vs Goals Scored Against', color='League', hover_name='Squad')
fig.update_yaxes(title="Goals Scored Against")
fig.update_xaxes(title="Tackles Won Percentage")
fig.show()
fig = px.violin(df, x='League', y='Worth',color='League')
fig.update_layout(
title='Violin Plot of The Distribution Of the Total Team Worth In Euros by League',
xaxis_title='League',
yaxis_title='Total Team Worth In Euros'
)
fig.show()
fig = px.scatter(df, x='GD', y='Pts', color='League', size='Attendance', hover_data=['Squad'])
fig.update_layout(
xaxis_title='Goal Difference',
yaxis_title='Points',
title='Scatter Plot and Line of Regression'
)
fig.update_layout(xaxis_range=[-50, 55], yaxis_range=[0, 90])
fig.show()
fig = px.scatter(df, x='YCard', y='GD', trendline='ols', color='League', hover_data=['Squad'])
fig.update_layout(xaxis_title='Yellow Cards', yaxis_title='Goals',title='Relationship Between The Number Of Yellow Card and Goals')
fig.show()
fig = px.scatter(df, x='#OfPlayers', y='GD', trendline='ols', color='League', hover_data=['Squad'])
fig.update_layout(xaxis_title='Number Of Players', yaxis_title='Goals',title='Relationship Between The Number Of Players and Goals')
fig.show()
fig = px.scatter(df, x='Age', y='GD', color='League', hover_data=['Squad'])
fig.update_layout(xaxis_title='Average Age of The Players', yaxis_title='Goals',title='Relationship Between The Average Age Of The Players and Goals')
fig.show()
fig = px.scatter(df, x=df.index, y='YCard', title='Yellow Card Scatter Plot', color='League', hover_name='Squad')
fig.update_layout(xaxis=dict(showticklabels=False))
fig.update_xaxes(title=None)
fig.update_yaxes(title="Yellow Card")
fig.show()
formula = "Pts ~ Worth + Clrearances + Blocks + PassLive + Top_Scorer_Goals + Tackles + Att_3rd - Errors + Poss + Progresive_Plays + Shot_Creating_Actions + TakeOnsTS + FoulTS - YCard + Attendance "
reg = smf.ols(formula=formula, data=df).fit()
reg.summary()
| Dep. Variable: | Pts | R-squared: | 0.688 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.640 |
| Method: | Least Squares | F-statistic: | 14.25 |
| Date: | Fri, 12 May 2023 | Prob (F-statistic): | 3.37e-16 |
| Time: | 19:43:24 | Log-Likelihood: | -349.29 |
| No. Observations: | 98 | AIC: | 726.6 |
| Df Residuals: | 84 | BIC: | 762.8 |
| Df Model: | 13 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 19.0512 | 27.326 | 0.697 | 0.488 | -35.290 | 73.392 |
| Worth | 1.02e-08 | 6.97e-09 | 1.464 | 0.147 | -3.66e-09 | 2.41e-08 |
| Clrearances | -0.0024 | 0.017 | -0.139 | 0.890 | -0.037 | 0.032 |
| Blocks | -0.0203 | 0.039 | -0.521 | 0.604 | -0.098 | 0.057 |
| PassLive | 0.0045 | 0.064 | 0.070 | 0.944 | -0.123 | 0.132 |
| Top_Scorer_Goals | 0.7697 | 0.229 | 3.359 | 0.001 | 0.314 | 1.225 |
| Tackles | -0.0387 | 0.023 | -1.707 | 0.091 | -0.084 | 0.006 |
| Att_3rd | 0.1099 | 0.089 | 1.236 | 0.220 | -0.067 | 0.287 |
| Poss | 0.4094 | 0.401 | 1.021 | 0.310 | -0.388 | 1.206 |
| Progresive_Plays | -0.0004 | 0.008 | -0.055 | 0.957 | -0.016 | 0.015 |
| Shot_Creating_Actions | 0.0196 | 0.052 | 0.379 | 0.706 | -0.083 | 0.123 |
| TakeOnsTS | 0.1554 | 0.119 | 1.309 | 0.194 | -0.081 | 0.392 |
| FoulTS | -0.1569 | 0.163 | -0.960 | 0.340 | -0.482 | 0.168 |
| Attendance | -7.855e-06 | 6.89e-05 | -0.114 | 0.909 | -0.000 | 0.000 |
| Omnibus: | 1.718 | Durbin-Watson: | 1.834 |
|---|---|---|---|
| Prob(Omnibus): | 0.424 | Jarque-Bera (JB): | 1.583 |
| Skew: | 0.197 | Prob(JB): | 0.453 |
| Kurtosis: | 2.518 | Cond. No. | 1.10e+10 |
coef = reg.params.drop('Intercept')
coef.plot(kind='barh')
plt.xlabel('Coefficient')
plt.ylabel('Variable')
plt.title('Regression Coefficients')
plt.show()
# create a new column in the data frame to store the predicted values
df['predicted_pts'] = reg.predict(df)
fig = px.scatter(df, x='predicted_pts', y='Pts', title='Predicted vs Actual Pts')
fig.update_xaxes(title="Predicted Points")
fig.update_yaxes(title="Actual Points")
fig.show()
# create a new column in the data frame to store the residuals
df['residuals'] = reg.resid
# create a scatter plot of residuals vs predicted 'Pts'
fig = px.scatter(df, x='predicted_pts', y='residuals', title='Residual Plot')
fig.update_xaxes(title="Predicted Points")
fig.update_yaxes(title="Residuals")
# add a horizontal line at y=0 to indicate the zero residual line
fig.add_shape(type='line', x0=df['predicted_pts'].min(), y0=0, x1=df['predicted_pts'].max(), y1=0, line=dict(color='red'))
fig.show()
kmeans = KMeans(n_clusters=5, random_state=0).fit(df[['Poss', 'Worth', 'Progresive_Plays', 'Shot_Creating_Actions', 'Goals']])
# Add the cluster labels to the DataFrame
df['Cluster'] = kmeans.labels_
# Plot the results using Plotly Express
fig = px.scatter(df, x='GD', y='Pts', color='Cluster',
hover_data=['Squad', 'CL/REL', 'LgRk'])
fig.update_layout(title='K-means clustering with 5 clusters')
fig.show()
/usr/local/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
# Separate the target variable and the features
X = df[['Worth', 'SCA/90', 'PassLive', 'PassDead', 'TakeOnsTS', 'ShotTS', 'FoulTS', 'DefTS', 'GCA/90', '#OfPlayers', 'Age', 'Poss', 'YCard', 'RCard', 'PrgC', 'PrgP', 'Tackles', 'TacklesW', 'Def_3rd', 'Mid_3rd', 'Att_3rd', 'Blocks', 'Interceptions', 'Clrearances', 'Errors', 'Attendance', 'Progresive_Plays', 'TackleWP']]
y = df['GD']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)
# Add the 'Squad' column to the test data set
test_data = X_test.copy()
test_data['Squad'] = df.loc[y_test.index, 'Squad']
# Create the Random Forest Regression model and fit the training data
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Make predictions on the testing data
y_pred = rf.predict(X_test)
# Evaluate the model using R-squared metric
r2 = rf.score(X_test, y_test)
print(f'R-squared: {r2:.2f}')
# Add the predicted values to the data frame with the 'Squad' column
df_pred = test_data[['Squad']].copy()
df_pred['Predicted_GD'] = y_pred
# Merge with the original data frame to get additional information
df_pred = pd.merge(df_pred, df[['Squad', 'GD']], on='Squad', how='left')
# Create a scatter plot of actual vs. predicted values
fig = px.scatter(df_pred, x='GD', y='Predicted_GD', trendline='ols', hover_data=['Squad'])
fig.update_layout(title='Graph Of Goal Difference vs Predicted Goal Difference')
fig.update_xaxes(title='Goal Differnce')
fig.update_yaxes(title='Predicted Goal Differnce')
fig.show()
R-squared: 0.77
The random forest model is a popular machine learning model used for regression. This model has an R^2 value of 0.77, which indicates that it is a good model. The model can be utilized to predict the goal difference a team has when provided with some statistical factors that are critical to a team's performance. As shown in the graph, many of the estimated goal difference values are close to the real goal difference values, indicating the effectiveness of the model in estimating the goal difference. Estimating the goal difference can be useful because it combines both the offensive aspect of the number of goals and the defensive aspect of the number of goals scored against, making it a crucial factor in predicting a team's performance. The random forest model is extremely useful, and more information about it can be found here.